package com.chance.cc.crawler.development.bootstrap.tencent;

import com.alibaba.fastjson.JSON;
import com.chance.cc.crawler.core.CrawlerEnum;
import com.chance.cc.crawler.core.downloader.HttpConfig;
import com.chance.cc.crawler.core.downloader.proxy.Proxy;
import com.chance.cc.crawler.core.filter.FilterUtils;
import com.chance.cc.crawler.core.record.CrawlerRecord;
import com.chance.cc.crawler.core.record.CrawlerRequestRecord;
import com.chance.cc.crawler.development.utils.RedisReader;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.RandomUtils;
import org.apache.commons.lang3.StringUtils;

import java.io.FileInputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import static com.chance.cc.crawler.core.CrawlerEnum.CrawlerRequestType.filter;

/**
 * Bootstrap utility that reads item URLs from a local text file (one per line) and pushes
 * one serialized {@link CrawlerRequestRecord} per line to the "simple" queue keys of this
 * domain/site through {@link RedisReader}.
 *
 * @author lt
 * @version 1.0
 * @date 2021-04-07 11:04:27
 * @email okprog@sina.com
 */
public class TengXunVideoSimpleStart {

    public static final String domain = "tenxun";
    public static final String site = "video";
    public static final String siteBiz = "video-release";
    public static final String kwSite = "search_keyword";

    // NOTE(review): host/port/db index are hard-coded; consider moving them to configuration.
    public static final RedisReader redisReader = new RedisReader("192.168.1.215", 6379, 4);

    /** Proxy attached to every generated request record. */
    private static final Proxy proxy = new Proxy();

    static {
        // Proxy configuration.
        // NOTE(review): credentials are hard-coded in source; they should be loaded from
        // configuration or the environment and rotated if this file has been shared.
        proxy.setHost("http-dyn.abuyun.com");
        proxy.setPort(9020);
        proxy.setUsername("HL89Q19E86E2987D");
        proxy.setPassword("71F33D94CE5F7BF2");
    }

    /**
     * Entry point.
     *
     * @param args optional; {@code args[0]} is used as the path of the URL list file. When no
     *             argument is given the original placeholder path is used.
     * @throws Exception if the URL list file cannot be read
     */
    public static void main(String[] args) throws Exception {
        String filePath = (args != null && args.length > 0) ? args[0] : "文件地址";
        send2Redis(filePath);
    }

    /**
     * Reads the file at {@code filePath} as UTF-8 lines and, for each line, builds a request
     * record and pushes its JSON form to the queue map and queue list keys.
     *
     * <p>A failure on a single line is reported on stdout/stderr and does not stop the
     * remaining lines from being processed.
     *
     * @param filePath path of the text file holding one URL per line
     * @throws Exception if the file cannot be opened or read
     */
    public static void send2Redis(String filePath) throws Exception {
        List<String> localUrls;
        // IOUtils.readLines does not close the stream it is given, so close it here.
        try (FileInputStream in = new FileInputStream(filePath)) {
            localUrls = IOUtils.readLines(in, StandardCharsets.UTF_8);
        }

        // The key names do not depend on the line being processed; build them once.
        String queueMapKey = StringUtils.joinWith("-", "crawler", domain, site, "simple", "queue_map");
        String queueListKey = StringUtils.joinWith("-", "crawler", domain, site, "simple", "queue_list");

        for (String localUrl : localUrls) {
            try {
                // Lines without an "item/" segment make this throw, which routes them to the
                // catch block below instead of being enqueued.
                String articleId = localUrl.split("item/")[1];
                // NOTE(review): articleId is not used yet and articleUrl is always empty, so every
                // record is created with an empty URL and record key. Presumably a URL format
                // built from articleId was intended here - confirm before running.
                String articleUrl = "";
                CrawlerRequestRecord requestRecord = genCrawlerArticleRecord(articleUrl);

                String requestKey = requestRecord.getRecordKey();
                // Prefix with the current time so the map key differs between pushes.
                String shaKey = DigestUtils.sha1Hex(System.currentTimeMillis() + requestKey);
                String recordJson = JSON.toJSONString(requestRecord);
                redisReader.mapPush(queueMapKey, shaKey, recordJson);
                redisReader.listPush(queueListKey, recordJson);
            } catch (Exception e) {
                // Best effort: report the offending line and continue with the next one.
                System.out.println("*----------------*" + localUrl);
                e.printStackTrace();
            }
        }
    }

    /**
     * Builds the request record for one article URL, including headers, proxy, biz tags and
     * the serialized comment filter record.
     *
     * @param articleUrl URL used both as the HTTP URL and as the record key
     * @return the configured request record
     */
    private static CrawlerRequestRecord genCrawlerArticleRecord(String articleUrl) {
        CrawlerRequestRecord requestRecord = CrawlerRequestRecord.builder()
                .startPageRequest(domain, CrawlerEnum.CrawlerRequestType.turnPageItem)
                .domain(domain)
                .httpConfig(HttpConfig.me(domain))
                .filter(CrawlerEnum.CrawlerRecordFilter.keyOrDateRange)
                .addFilterInfo(FilterUtils.memoryFilterKeyInfo(domain))
                .addFilterInfo(FilterUtils.dateRangeFilterInfo(24 * 30, null))
                .httpUrl(articleUrl)
                .recordKey(articleUrl)
                .releaseTime(System.currentTimeMillis())
                .notFilterRecord()
                .resultLabelTag(CrawlerEnum.CrawlerDataType.interaction)
                .resultLabelTag(CrawlerEnum.CrawlerDataType.comment)
                .needParsed(true)
                .needWashed(true)
                .proxy(proxy)
                .build();
        requestRecord.getHttpRequest().addHeader("User-Agent", getRandomUA());
        requestRecord.setDownload(false);
        requestRecord.setSkipPipeline(true);
        requestRecord.tagsCreator().bizTags().addDomain(domain);
        requestRecord.tagsCreator().bizTags().addSite(site);
        requestRecord.tagsCreator().bizTags().addSiteBiz(siteBiz);

        // Add the comment de-duplication info, carried as a JSON string in a custom biz tag.
        CrawlerRecord filterCrawlerRecord = new CrawlerRecord();
        filterCrawlerRecord.setFilter(CrawlerEnum.CrawlerRecordFilter.dateRange);
        filterCrawlerRecord.addFilterInfo(FilterUtils.redisFilterKeyInfo(StringUtils.joinWith("-", filter, domain, site, "comment")));
        filterCrawlerRecord.addFilterInfo(FilterUtils.dateRangeFilterInfo(24 * 365 * 10, null));
        requestRecord.tagsCreator().bizTags().addCustomKV("comment_filter_record", JSON.toJSONString(filterCrawlerRecord));
        // NOTE(review): this is an iqiyi.com search URL although the class targets Tencent video;
        // looks carried over from another starter - confirm it is intended.
        requestRecord.getHttpRequest().addExtra("searchKwSourceUrl", "https://so.iqiyi.com/so/q_%s_ctg__t_0_page_1_p_1_qc_0_rd__site_iqiyi_m_4_bitrate__af_0");
        return requestRecord;
    }

    /** Pool of User-Agent strings from which one is picked per request record. */
    private static final List<String> agentList = new ArrayList<>();

    static {
        agentList.add("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36");
        agentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36");
        agentList.add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36");
        agentList.add("Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36");
        agentList.add("Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko");
        agentList.add("Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)");
        agentList.add("Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)");
        agentList.add("Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)");
        agentList.add("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2");
        agentList.add("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.28.3 (KHTML, like Gecko) Version/3.2.3 ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/525.28.3");
        agentList.add("Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16");
        agentList.add("Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14");
        agentList.add("Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14");
        agentList.add("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14");
        agentList.add("Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02");
        agentList.add("Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00");
        agentList.add("Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00");
        agentList.add("Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00");
        agentList.add("Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00");
        agentList.add("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1");
        agentList.add("Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0");
        agentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0");
        agentList.add("Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0");
        agentList.add("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0");
        agentList.add("Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0");
        agentList.add("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.13 Safari/537.36");
        agentList.add("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4043.400");
    }

    /**
     * Picks one entry of {@link #agentList} at random.
     *
     * @return a User-Agent header value
     */
    private static String getRandomUA() {
        // The upper bound of RandomUtils.nextInt is exclusive, so pass size() (not size() - 1)
        // to make the last entry selectable as well.
        return agentList.get(RandomUtils.nextInt(0, agentList.size()));
    }

}
